{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# Put the parent directory on the module search path (presumably so the project\n",
    "# packages imported by later cells resolve; confirm against the repo layout).\n",
    "# The membership check keeps a re-run of this cell from appending a duplicate.\n",
    "if \"../\" not in sys.path:\n",
    "    sys.path.append(\"../\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from data.mit_states import MITStatesDataset\n",
    "\n",
    "# One MITStatesDataset per split, constructed in train, test order.\n",
    "mit_train, mit_test = (MITStatesDataset(split=name) for name in ('train', 'test'))\n",
    "\n",
    "# Both splits together; handed to the vocabulary builder in the next cell.\n",
    "datasets = [mit_train, mit_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<language.vocabulary.SimpleVocabulary at 0x7fde7d7d4c90>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from language.vocabulary import SimpleVocabulary\n",
    "from language.tokenizers import BasicTokenizer\n",
    "from language.utils import create_read_func, create_write_func\n",
    "\n",
    "# Both storage helpers are created for the same file name, so it is spelled once.\n",
    "vocab_path = 'test.pkl'\n",
    "\n",
    "tokenizer = BasicTokenizer()\n",
    "read_func = create_read_func(vocab_path)\n",
    "write_func = create_write_func(vocab_path)\n",
    "\n",
    "# Build the vocabulary from `datasets` and store it through `write_func`.\n",
    "# As the last expression of the cell, the call's return value is the cell output.\n",
    "SimpleVocabulary.create_and_store_vocabulary_from_datasets(\n",
    "    datasets, tokenizer, write_func\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the vocabulary from storage via the reader callback defined in the\n",
    "# previous cell, rather than reusing the object that cell returned.\n",
    "vocab = SimpleVocabulary.create_vocabulary_from_storage(\n",
    "    read_func,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: '<pad>',\n",
       " 1: '<unk>',\n",
       " 2: '<str>',\n",
       " 3: '<end>',\n",
       " 4: 'tiny',\n",
       " 5: 'huge',\n",
       " 6: 'old',\n",
       " 7: 'young',\n",
       " 8: 'burnt',\n",
       " 9: 'small',\n",
       " 10: 'large',\n",
       " 11: 'weathered',\n",
       " 12: 'modern',\n",
       " 13: 'unpainted',\n",
       " 14: 'chipped',\n",
       " 15: 'painted',\n",
       " 16: 'ancient',\n",
       " 17: 'dirty',\n",
       " 18: 'frozen',\n",
       " 19: 'spilled',\n",
       " 20: 'fresh',\n",
       " 21: 'melted',\n",
       " 22: 'moldy',\n",
       " 23: 'narrow',\n",
       " 24: 'frayed',\n",
       " 25: 'thin',\n",
       " 26: 'folded',\n",
       " 27: 'wide',\n",
       " 28: 'engraved',\n",
       " 29: 'ruffled',\n",
       " 30: 'thick',\n",
       " 31: 'brushed',\n",
       " 32: 'broken',\n",
       " 33: 'muddy',\n",
       " 34: 'dry',\n",
       " 35: 'eroded',\n",
       " 36: 'barren',\n",
       " 37: 'windblown',\n",
       " 38: 'verdant',\n",
       " 39: 'mossy',\n",
       " 40: 'crushed',\n",
       " 41: 'molten',\n",
       " 42: 'whipped',\n",
       " 43: 'caramelized',\n",
       " 44: 'crumpled',\n",
       " 45: 'wilted',\n",
       " 46: 'pressed',\n",
       " 47: 'crinkled',\n",
       " 48: 'deflated',\n",
       " 49: 'cored',\n",
       " 50: 'coiled',\n",
       " 51: 'rusty',\n",
       " 52: 'cracked',\n",
       " 53: 'draped',\n",
       " 54: 'pierced',\n",
       " 55: 'shiny',\n",
       " 56: 'dented',\n",
       " 57: 'dull',\n",
       " 58: 'blunt',\n",
       " 59: 'curved',\n",
       " 60: 'sharp',\n",
       " 61: 'straight',\n",
       " 62: 'bent',\n",
       " 63: 'clean',\n",
       " 64: 'empty',\n",
       " 65: 'full',\n",
       " 66: 'cluttered',\n",
       " 67: 'grimy',\n",
       " 68: 'diced',\n",
       " 69: 'sliced',\n",
       " 70: 'pureed',\n",
       " 71: 'ripe',\n",
       " 72: 'unripe',\n",
       " 73: 'peeled',\n",
       " 74: 'new',\n",
       " 75: 'browned',\n",
       " 76: 'cooked',\n",
       " 77: 'thawed',\n",
       " 78: 'raw',\n",
       " 79: 'clear',\n",
       " 80: 'steaming',\n",
       " 81: 'heavy',\n",
       " 82: 'lightweight',\n",
       " 83: 'torn',\n",
       " 84: 'shattered',\n",
       " 85: 'fallen',\n",
       " 86: 'creased',\n",
       " 87: 'foggy',\n",
       " 88: 'squished',\n",
       " 89: 'runny',\n",
       " 90: 'viscous',\n",
       " 91: 'cut',\n",
       " 92: 'rough',\n",
       " 93: 'smooth',\n",
       " 94: 'mashed',\n",
       " 95: 'loose',\n",
       " 96: 'tight',\n",
       " 97: 'wet',\n",
       " 98: 'wrinkled',\n",
       " 99: 'worn',\n",
       " 100: 'damp',\n",
       " 101: 'splintered',\n",
       " 102: 'filled',\n",
       " 103: 'dark',\n",
       " 104: 'bright',\n",
       " 105: 'inflated',\n",
       " 106: 'ripped',\n",
       " 107: 'scratched',\n",
       " 108: 'toppled',\n",
       " 109: 'upright',\n",
       " 110: 'short',\n",
       " 111: 'tall',\n",
       " 112: 'murky',\n",
       " 113: 'winding',\n",
       " 114: 'sunny',\n",
       " 115: 'standing',\n",
       " 116: 'closed',\n",
       " 117: 'cloudy',\n",
       " 118: 'open'}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the id -> token table of the reloaded vocabulary.\n",
    "# NOTE(review): `_id2token` is underscore-prefixed, i.e. private by Python\n",
    "# convention; switch to a public accessor if SimpleVocabulary offers one.\n",
    "id_to_token = vocab._id2token\n",
    "id_to_token"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
